%pip install -q numpy matplotlib scipy faiss-cpu librosa pandas

Python interpreter will be restarted.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
petastorm 0.11.4 requires pyspark>=2.1.0, which is not installed.
pandas-profiling 3.1.0 requires joblib~=1.0.1, but you have joblib 1.4.2 which is incompatible.
mleap 0.20.0 requires scikit-learn<0.23.0,>=0.22.0, but you have scikit-learn 1.6.1 which is incompatible.
Python interpreter will be restarted.

import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import faiss
import pandas as pd
from scipy.signal import spectrogram

# Directory holding the .mp3 clips to index. The same directory is used to
# build full paths for the query clip and for the retrieved clips.
audio_directory = "/curated/AudioStore/audios/"

def get_audio_files(directory, extensions=(".mp3",)):
    """Return the names of the audio files found directly inside *directory*.

    Args:
        directory: Path of the folder to scan (sub-folders are not descended).
        extensions: File-name suffix, or iterable of suffixes, to accept.
            Matching is case-insensitive. Defaults to MP3 only.

    Returns:
        Sorted list of bare file names (no directory component).

    Raises:
        OSError: If *directory* cannot be listed.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # os.listdir returns entries in arbitrary order. Sorting keeps the mapping
    # between a file name and its position (and so its index row) reproducible
    # from one run to the next.
    return sorted(
        name for name in os.listdir(directory) if name.lower().endswith(suffixes)
    )

def extract_spectrogram_embedding(audio_file, sr=None, eps=1e-7):
    """Turn an audio file into a fixed-length log-power spectrum vector.

    The clip is decoded with librosa, its power spectrogram is computed with
    scipy.signal.spectrogram (default window settings), the power is moved to
    a log scale, and the result is averaged over the time axis so that one
    value per frequency bin remains.

    Args:
        audio_file: Path of the audio file to load.
        sr: Sample rate passed to librosa.load. None (the default) keeps each
            file's native rate. NOTE(review): with native rates, clips recorded
            at different rates map the same bin to different frequencies; pass
            a fixed rate if the collection is mixed.
        eps: Small offset added to the power before the log so that silent
            bins do not produce log(0).

    Returns:
        1-D numpy array with one entry per spectrogram frequency bin.
    """
    # librosa.load returns the rate it actually used, which is what the
    # spectrogram needs as its sampling frequency.
    y, sr = librosa.load(audio_file, sr=sr)

    # Only the power matrix is needed; the frequency and time axes are unused.
    _, _, power = spectrogram(y, fs=sr)

    # Log scale compresses the dynamic range of the power values.
    log_power = np.log(power + eps)

    # Average over time (axis 1) to collapse the clip to a single vector.
    return np.mean(log_power, axis=1)

# Collect the names of the clips that will be indexed.
audio_files = get_audio_files(audio_directory)

# Compute one embedding per clip, in the same order as audio_files, so that a
# position in this list identifies the clip it came from.
embeddings = list(
    map(
        extract_spectrogram_embedding,
        (os.path.join(audio_directory, clip_name) for clip_name in audio_files),
    )
)

def create_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over *embeddings*.

    Args:
        embeddings: Non-empty sequence of equal-length 1-D vectors, or an
            equivalent 2-D array, with one row per item.

    Returns:
        A faiss.IndexFlatL2 containing the vectors in the order given.

    Raises:
        ValueError: If *embeddings* is empty or does not form a 2-D matrix
            (for example a single un-nested vector).
    """
    matrix = np.array(embeddings).astype('float32')

    # Check the shape up front: reading shape[1] from an empty or 1-D array
    # would otherwise fail with an unhelpful IndexError.
    if matrix.ndim != 2 or 0 in matrix.shape:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors, "
            f"got array with shape {matrix.shape}"
        )

    dim = matrix.shape[1]  # Length of each embedding vector
    index = faiss.IndexFlatL2(dim)  # Exact search using L2 distance
    index.add(matrix)
    return index

# Build the search index once from every clip embedding; row i of the index
# holds embeddings[i].
faiss_index = create_faiss_index(embeddings)

def retrieve_similar_audio(query_file, faiss_index, k=2):
    """Find the indexed clips whose embeddings are closest to *query_file*.

    Args:
        query_file: Path of the audio file to use as the query.
        faiss_index: FAISS index whose vectors have the same length as the
            embedding computed for the query.
        k: Maximum number of neighbours to return. Capped at the number of
            vectors stored in the index.

    Returns:
        Tuple ``(indices, distances)`` (note: indices first). Both are arrays
        with a single row holding one entry per neighbour, nearest first;
        ``indices`` are row positions inside *faiss_index*.

    Raises:
        ValueError: If *k* is smaller than 1 or the index holds no vectors.
    """
    # Validate before decoding any audio so bad arguments fail fast.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if faiss_index.ntotal == 0:
        raise ValueError("cannot search an empty FAISS index")

    # FAISS pads the result with -1 when fewer than k vectors exist. Used as a
    # list index, -1 silently selects the last file, so never ask for more
    # neighbours than the index contains.
    k = int(min(k, faiss_index.ntotal))

    query_embedding = extract_spectrogram_embedding(query_file)
    # search() expects a 2-D float32 batch; reshape the vector to one row.
    query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
    distances, indices = faiss_index.search(query_embedding, k)
    return indices, distances

# Example query: take one clip from the audio directory and look up its two
# nearest neighbours in the index.
query_audio_file = os.path.join(audio_directory, "duskwolf_101348.mp3")
indices, distances = retrieve_similar_audio(
    query_file=query_audio_file,
    faiss_index=faiss_index,
    k=2,
)

def display_audio_table(data, title):
    """Print *title* followed by *data* rendered as a markdown table.

    Args:
        data: Rows of three values each: index, audio file name, and the
            leading embedding values to show.
        title: Heading printed on its own line above the table.
    """
    column_names = ['Index', 'Audio', 'Embedding (First 5 embeddings)']
    table = pd.DataFrame(data, columns=column_names)

    # Heading first (preceded by a blank line), then the table body.
    print(f"\n{title}")
    rendered = table.to_markdown(index=False)  # markdown gives aligned columns
    print(rendered)

def display_results(audio_files, embeddings, indices, query_audio_file):
    """Show the indexed clips, the query waveform and the retrieved clips.

    Output order: a table of every indexed clip, the query waveform plot, a
    table of the retrieved clips, then the retrieved waveform plots.

    Args:
        audio_files: File names of the indexed clips.
        embeddings: Embedding vectors, where embeddings[i] belongs to
            audio_files[i].
        indices: Search result whose first row lists positions into
            *audio_files*, nearest first.
        query_audio_file: Path of the clip that was used as the query.
    """
    # Table of every indexed clip with the first five embedding values.
    original_audio_data = [
        (i, name, embeddings[i][:5]) for i, name in enumerate(audio_files)
    ]
    display_audio_table(original_audio_data, "Original Audio Files and Embeddings:")

    # Waveform of the query clip.
    plot_query_audio_waveform(query_audio_file)

    # FAISS marks "no neighbour found" with -1. Drop those entries, because as
    # a list index -1 would silently display the last clip instead.
    hits = [int(idx) for idx in indices[0] if idx >= 0]

    # Table of the retrieved clips; the Index column is the rank (0 = nearest).
    predicted_audio_data = [
        (rank, audio_files[idx], embeddings[idx][:5]) for rank, idx in enumerate(hits)
    ]
    display_audio_table(predicted_audio_data, "Predicted Audio Files and Embeddings:")

    # Waveforms of the retrieved clips, side by side. Skipped when nothing was
    # retrieved, since there would be no subplot to draw.
    if hits:
        plot_predicted_audio_waveforms(audio_files, [hits])

def plot_query_audio_waveform(query_audio_file):
    """Plot the waveform of the query clip, titled with its file name.

    Args:
        query_audio_file: Path of the audio file to load and plot.
    """
    # Only the file name (not the full path) goes into the title.
    clip_name = os.path.basename(query_audio_file)

    # sr=None keeps the file's native sample rate; only the samples are drawn.
    samples, _ = librosa.load(query_audio_file, sr=None)

    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(samples)
    ax.set_title(f"User Query Audio Waveform: {clip_name}")
    ax.set_xlabel("Sample Number")
    ax.set_ylabel("Amplitude")
    plt.show()

def plot_predicted_audio_waveforms(audio_files, indices):
    """Plot the waveforms of the retrieved clips side by side.

    Args:
        audio_files: File names of the indexed clips, relative to the
            module-level ``audio_directory``.
        indices: Search result whose first row lists positions into
            *audio_files*. Negative entries are ignored.

    Returns:
        None. Nothing is drawn when there are no valid positions.
    """
    # FAISS uses -1 for "no neighbour found"; as a list index that would plot
    # the last clip, so discard such entries.
    hits = [int(idx) for idx in indices[0] if idx >= 0]
    if not hits:
        return  # A figure with zero columns cannot be created.

    # squeeze=False always yields a 2-D axes array, so one hit needs no
    # special handling.
    _, axes = plt.subplots(1, len(hits), figsize=(12, 4), squeeze=False)

    for ax, idx in zip(axes[0], hits):
        retrieved_audio_file = audio_files[idx]
        retrieved_audio_name = os.path.basename(retrieved_audio_file)  # file name only
        y, _ = librosa.load(os.path.join(audio_directory, retrieved_audio_file), sr=None)
        ax.plot(y)
        ax.set_title(f"Predicted Audio Waveform: {retrieved_audio_name}")
        ax.set_xlabel("Sample Number")
        ax.set_ylabel("Amplitude")

    plt.tight_layout()  # Keep neighbouring subplots from overlapping
    plt.show()

# Print both tables and show the query and retrieved waveforms for the
# example query.
display_results(audio_files, embeddings, indices, query_audio_file)

Original Audio Files and Embeddings:
|   Index | Audio                              | Embedding (First 5 embeddings)                                |
|--------:|:-----------------------------------|:--------------------------------------------------------------|
|       0 | diesel_mercedes_190_d_33940.mp3    | [-15.053708 -12.10706  -12.368397 -13.738781 -14.303338]      |
|       1 | dog1_small_barking_angirly.mp3     | [-15.80812  -15.75914  -15.568337 -15.184358 -14.459221]      |
|       2 | dog2_small_dog_barking.mp3         | [-15.837855 -15.630339 -15.422517 -14.892724 -15.095776]      |
|       3 | dog3_barking.mp3                   | [-15.591373 -15.58906  -15.320949 -14.866169 -14.144432]      |
|       4 | duskwolf_101348.mp3                | [-15.414704 -15.892582 -15.705197 -15.354351 -14.447784]      |
|       5 | truck1_Hyundai_Tractor_Engine.mp3  | [-13.268114  -9.427747 -11.069034 -11.546685  -9.743669]      |
|       6 | truck2_Dododge.mp3                 | [-15.470654 -12.924455 -12.648085 -13.403902 -14.288434]      |
|       7 | truck_diesel_07dodge_rev_98278.mp3 | [-15.307348 -13.277036 -12.684735 -13.181681 -14.059005]      |
|       8 | wol3_wolves.mp3                    | [-16.056688 -16.060814 -15.998393 -15.741171 -15.141277]      |
|       9 | wolf1_howling.mp3                  | [-15.400748 -14.87836  -13.977464 -13.397399 -15.081363]      |
|      10 | wolf2_howling_wolves.mp3           | [-15.7981415 -15.991037  -15.964015  -15.954524  -15.771357 ] |

Predicted Audio Files and Embeddings:
|   Index | Audio               | Embedding (First 5 embeddings)                           |
|--------:|:--------------------|:---------------------------------------------------------|
|       0 | duskwolf_101348.mp3 | [-15.414704 -15.892582 -15.705197 -15.354351 -14.447784] |
|       1 | dog3_barking.mp3    | [-15.591373 -15.58906  -15.320949 -14.866169 -14.144432] |

Audio Embedding